/* generate data for Tables 4 and 5, which include health majors */
libname edu '!userprofile\\Dropbox\Education\Replication\Data';

* get data;
proc sql;
  create table regression2 as
  select distinct year, major
  from edu.skew_bio;
quit;

proc sql;
  create table enrolment_skew_return as
  select a.*, b.skew1e_major
  from regression2 as a, edu.skew_bio as b
  where 1 <= a.year - b.year <= 2 and a.major = b.major and a.major ~= .;
quit;

proc sql;
  create table enrolment_skew_return2 as
  select a.*, b.skew1e_major
  from regression2 as a, edu.skew_bio as b
  where 3 <= a.year - b.year <= 7 and a.major = b.major and a.major ~= .;
quit;

* calculate the average;
proc sort data = enrolment_skew_return; by year major major; run;

proc means data = enrolment_skew_return noprint;
  by year major major;
  var skew1e_major;
  output out = return mean = skew1e_major_avg1;
run;

proc sort data = enrolment_skew_return2; by year major major; run;

proc means data = enrolment_skew_return2 noprint;
  by year major major;
  var skew1e_major;
  output out = return2 mean = skew1e_major_avg2;
run;

* create a dataset for regressions;
proc sql;
  create table regression2 as
  select a.*, b.skew1e_major_avg1
  from regression2 as a left join return as b
  on a.year = b.year and a.major = b.major and a.major ~= .;
quit;

proc sql;
  create table regression2 as
  select a.*, b.skew1e_major_avg2
  from regression2 as a left join return2 as b
  on a.year = b.year and a.major = b.major and a.major ~= .;
quit;

* add time-series measures;
proc sql;
  create table temp as
  select distinct a.*, b.tsmean2_major as tsmean_major_avg1, b.tsvol2_major as tsvol_major_avg1
  from regression2 as a left join edu.skew_bio as b
  on a.year = b.year+1 and a.major = b.major;
quit;

proc sql;
  create table temp as
  select distinct a.*, b.tsmean5_major as tsmean_major_avg2, b.tsvol5_major as tsvol_major_avg2
  from temp as a left join edu.skew_bio as b
  on a.year = b.year+3 and a.major = b.major;
quit;

data regression_avg2; set temp; run;

data edu.regression_avg2e_major_bio (drop = year major);
  set regression_avg2;
  year_ending = year;
  majorcode = major;
  if year_ending < 1966 or year_ending > 2017 then delete;
  if skew1e_major_avg2 = . or tsmean_major_avg2 = . or tsvol_major_avg2 = . or
  skew1e_major_avg1 = . or tsmean_major_avg1 = . or tsvol_major_avg1 = . then delete;
run;
